import tensorflow as tf
from scipy.sparse import load_npz
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error


def model_rep(y_true, y_pred, average="micro"):
    """Return (precision, recall, f1) for the given class predictions.

    *average* is forwarded unchanged to the sklearn metric functions
    (default "micro").
    """
    metrics = (precision_score, recall_score, f1_score)
    p, r, f1score = (m(y_true, y_pred, average=average) for m in metrics)
    return p, r, f1score


def eu_distances(array1, array2):
    """Mean of the element-wise squared differences of two sequences.

    NOTE: despite the name this is a mean *squared* difference, not a
    Euclidean norm (no square root is taken). Divides by len(array1),
    so array2 must be at least as long as array1.
    """
    squared_diffs = [(array1[pos] - array2[pos]) ** 2 for pos in range(len(array1))]
    return sum(squared_diffs) / len(array1)


def average_distance(y_true_prod, y_pred_prod):
    """Mean eu_distances value over paired rows of the two collections.

    Divides by len(y_true_prod); y_pred_prod must be at least as long.
    """
    total = 0.0
    row_count = len(y_true_prod)
    for row_idx in range(row_count):
        total += eu_distances(y_true_prod[row_idx], y_pred_prod[row_idx])
    return total / row_count


def load_dataset(x_path, y_path):
    """Load features (scipy sparse .npz) and labels (CSV of 5-way one-hot rows).

    Each line of *y_path* holds at least five comma-separated numeric 0/1
    flags; only the first five are used.

    Returns a tuple (x_, y_, y_oh):
      x_   -- scipy sparse matrix loaded from *x_path*
      y_   -- list of single-element lists [label], label in 1..5
      y_oh -- list of 5-element one-hot int lists

    Fix over the original: the dead `idx` line counter (incremented but
    never read) is removed, and the comprehension variable no longer
    shadows it.
    """
    y_ = []
    y_oh = []
    with open(y_path, encoding="utf8") as f:
        for line in f:
            ll = line.strip().split(",")
            # 1-based argmax of a one-hot row: position `pos` contributes
            # `pos` when its flag is set, so max(...) + 1 is the rating.
            # (Assumes one-hot input; if several flags were set, the
            # largest set index would win.)
            y_item = max(pos * int(float(ll[pos])) for pos in range(5)) + 1
            y_oh_item = [int(float(item)) for item in ll[:5]]
            y_.append([y_item])
            y_oh.append(y_oh_item)
    x_ = load_npz(x_path)
    return x_, y_, y_oh


def get_batch_csr(data, start_id, end_id):
    """Return rows [start_id, end_id) of a CSR sparse matrix as a dense
    list of float32-valued row lists.

    start_id/end_id may be floats (the caller multiplies by a float
    BATCH_SIZE); they are truncated with int() as before.

    Fix over the original: a single CSR row-slice plus one toarray() call
    replaces the per-row `result = result + ...` loop, which rebuilt the
    accumulator list on every iteration (quadratic in batch size).
    """
    return data[int(start_id):int(end_id)].toarray().astype(np.float32).tolist()


def exp_score(y_pred_prod):
    """Expected rating of one probability row: sum_i p_i * (i + 1).

    Treats index i as rating (i + 1), so a 5-way distribution yields a
    score in [1, 5] when the probabilities sum to 1.
    """
    total = 0.0
    for rating, prob in enumerate(y_pred_prod, start=1):
        total += prob * rating
    return total


def exp_score_batch(y_pred_prod):
    """Expected rating (see exp_score) for every row in the batch."""
    return [exp_score(row) for row in y_pred_prod]


def pred_max(y_pred_prod):
    """1-based argmax of every probability row (ties resolve to the
    first/lowest index, as list.index does)."""
    return [row.index(max(row)) + 1 for row in y_pred_prod]


# Data-file locations — presumably a MovieLens-1M derived dataset generated
# 2019-05-09 (judging from the path names); verify against the data pipeline.
GEN_DATA_TRAIN_PATH = "../../data/ml-1m_20190508/TRAIN_2019050901_gen.csv"
GEN_DATA_TEST_PATH = "../../data/ml-1m_20190508/TEST_2019050901_gen.csv"
GEN_DATA_TRAIN_X_PATH = "../../data/ml-1m_20190508/TRAIN_X_gen_2019050901.npz"
GEN_DATA_TRAIN_Y_PATH = "../../data/ml-1m_20190508/TRAIN_Y_gen_2019050901.csv"
GEN_DATA_TEST_X_PATH = "../../data/ml-1m_20190508/TEST_X_gen_2019050901.npz"
GEN_DATA_TEST_Y_PATH = "../../data/ml-1m_20190508/TEST_Y_gen_2019050901.csv"
# %-format template for the final metrics report printed at the bottom of the script.
PRINT_RESULT = "%s result get:\nprecision: %s,\nrecall: %s,\nf1: %s,\nMAE: %s,\nMSE: %s,\naverage distance: %s, \nexp_MAE: %s, \nexp_MSE: %s\n---------------\n"

EPOCH = 20  # number of passes over the training data
BATCH_SIZE = 128.0  # mini-batch size; kept as float, cast with int() where used for slicing
LEARNING_RATE = 0.01  # initial LR; the training loop divides it by 10 every 5 epochs

# Load the training split: features as a scipy CSR .npz, labels as a CSV of
# one-hot rows (see load_dataset above).
x_train, y_train, y_train_oh = load_dataset(GEN_DATA_TRAIN_X_PATH, GEN_DATA_TRAIN_Y_PATH)
print("read train data_done")

# Batch bookkeeping: number of training examples.
data_len = len(y_train_oh)

in_size = x_train.shape[1]  # feature dimension
out_size = len(y_train_oh[0])  # number of classes (width of a one-hot label row)
# out_size = 1

# TF1 graph definition: placeholders for a mini-batch of dense features and
# one-hot labels.
x = tf.placeholder(tf.float32, [None, in_size], name="x")
y = tf.placeholder(tf.float32, [None, out_size], name="y")
learning_rate = tf.placeholder(tf.float32)  # fed per step so the LR can decay
# y = tf.placeholder(tf.float32, [None, 1], name="y")

# Single-layer softmax regression: pred = softmax(x @ W1 + b1).
W1 = tf.Variable(tf.random_normal([in_size, out_size]), name='W1')
b1 = tf.Variable(tf.zeros([out_size]), name='b1')

pred = tf.nn.softmax(tf.matmul(x, W1) + b1)


# cal = tf.square(pred - y)
# tf_sum = tf.reduce_sum(tf.square(pred - y), axis=1)
# loss = tf.reduce_mean(tf_sum)
# loss = tf.reduce_mean(tf.square(pred - y))
# Cross-entropy against the one-hot labels, averaged over the batch.
# NOTE(review): tf.log(pred) yields -inf if a softmax output underflows to 0;
# tf.nn.softmax_cross_entropy_with_logits on the raw logits would be safer.
loss = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Ceiling division for the batch count: the old `int(n / b) + 1` produced
    # a trailing EMPTY batch whenever b divided n exactly.
    BATCH_LEN = (data_len + int(BATCH_SIZE) - 1) // int(BATCH_SIZE)
    for epoch_cur in range(EPOCH):
        # Step the learning rate down by 10x every 5 epochs.
        cur_LR = LEARNING_RATE * (10 ** (-int(epoch_cur/5)))
        for idx in range(BATCH_LEN):
            start = idx * int(BATCH_SIZE)
            end = min((idx + 1) * int(BATCH_SIZE), data_len)
            x_data = get_batch_csr(x_train, start, end)
            y_data = y_train_oh[start:end]
            sess.run(train_step, feed_dict={x: x_data, y: y_data, learning_rate: cur_LR})
            if idx % 500 == 0:
                print("%s th batch, loss: %s" % (idx, sess.run(loss, feed_dict={x: x_data, y: y_data})))
                print(y_data[0:2])
                print(sess.run(pred, feed_dict={x: x_data[0:2]}).tolist())
            # if idx > 1000:
            #     break

    # Evaluate on the held-out test split.
    # BUG FIX: the test features must come from the TEST .npz, not the TRAIN
    # one (the original fed TRAIN_X with TEST_Y, pairing features and labels
    # from different examples — the intended call was left commented out).
    x_test, y_test, y_test_oh = load_dataset(GEN_DATA_TEST_X_PATH, GEN_DATA_TEST_Y_PATH)
    y_pred_exp = []  # expectation-based score, one float per example
    y_pred_oh = []   # raw softmax probability vectors
    y_pred_max = []  # 1-based argmax class, one int per example
    test_len = len(y_test_oh)
    BATCH_LEN = (test_len + int(BATCH_SIZE) - 1) // int(BATCH_SIZE)
    for idx in range(BATCH_LEN):
        start = idx * int(BATCH_SIZE)
        end = min((idx + 1) * int(BATCH_SIZE), test_len)
        x_data = get_batch_csr(x_test, start, end)
        # (The original also sliced y_test_oh here, bounded by the TRAIN
        # data_len by mistake — the slice was never used, so it is removed.)
        pred_get = sess.run(pred, feed_dict={x: x_data}).tolist()
        y_pred_oh = y_pred_oh + pred_get
        y_pred_exp = y_pred_exp + exp_score_batch(pred_get)
        y_pred_max = y_pred_max + pred_max(pred_get)
        if idx % 500 == 0:
            print("%s th batch" % (idx))
        # if idx > 100:
        #     break
# print(len(y_test), len(y_pred_max))

# Final report: classification quality of the argmax predictions, regression
# error of the 1..5 scores, and the mean distance between probability vectors.
p, r, f1score = model_rep(y_test, y_pred_max)           # precision/recall/F1 on the argmax class
mae = mean_absolute_error(y_test, y_pred_max)           # MAE of the argmax score
mse = mean_squared_error(y_test, y_pred_max)            # MSE of the argmax score
ad = average_distance(y_test_oh, y_pred_oh)            # mean per-row squared distance (see eu_distances)
exp_mae = mean_absolute_error(y_test, y_pred_exp)       # MAE of the expectation-based score
exp_mse = mean_squared_error(y_test, y_pred_exp)        # MSE of the expectation-based score
print(PRINT_RESULT % ("test", p, r, f1score, mae, mse, ad, exp_mae, exp_mse))
